$\newcommand{\vect}[1]{{\mathbf{\boldsymbol{#1}} }}$ $\newcommand{\amax}{{\text{argmax}}}$ $\newcommand{\P}{{\mathbb{P}}}$ $\newcommand{\E}{{\mathbb{E}}}$ $\newcommand{\R}{{\mathbb{R}}}$ $\newcommand{\Z}{{\mathbb{Z}}}$ $\newcommand{\N}{{\mathbb{N}}}$ $\newcommand{\C}{{\mathbb{C}}}$ $\newcommand{\abs}[1]{{ \left| #1 \right| }}$ $\newcommand{\simpl}[1]{{\Delta^{#1} }}$

Anomaly Detection via Density Estimation

Snow
import ipywidgets as widgets
import itertools as it
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px


from ipywidgets import interact
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import IsolationForest
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity
from tensorflow import keras
from tqdm import tqdm

from tfl_training_anomaly_detection.exercise_tools import (
    evaluate, 
    get_kdd_data, 
    get_house_prices_data, 
    create_distributions, 
    contamination, 
    perform_rkde_experiment, 
    get_mnist_data
)
from tfl_training_anomaly_detection.vae import VAE, build_decoder_mnist, build_encoder_minst, build_contaminated_minst

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (5, 5)

Exercise: Play with the parameters!

We've prepared a toy dataset and the required helper functions for you. Your task is to choose a proper kernel and bandwidth in order to define a suitable density model.

# Build a 2-D toy problem with a bimodal ("Double Blob") nominal distribution
dists = create_distributions(dim=2, dim_irrelevant=0)

# sample() returns a tuple; by convention in this notebook the last element
# is the feature matrix (see X_train usage below)
sample_train = dists['Double Blob'].sample(500)
X_train = sample_train[-1]
# Training data is clean: every point is nominal (label 0)
y_train = [0]*len(X_train)

plt.scatter(X_train[:,0], X_train[:,1], c='blue', s=10)
plt.show()
# Helper function
def fit_kde(kernel: str, bandwidth: float, X_train: np.array) -> KernelDensity:
    """Fit a kernel density estimator on the training data.

    @param kernel: name of the kernel function to use
    @param bandwidth: kernel bandwidth
    @param X_train: training data, shape (n_samples, n_features)
    @return: the fitted KernelDensity model
    """
    # KernelDensity.fit returns the estimator itself, so we can chain
    return KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(X_train)

def visualize_kde(kde: KernelDensity, bandwidth: float, X_test: np.ndarray, y_test: np.ndarray) -> None:
    """Plot the KDE density contours on a 50x50 grid over [-10, 10]^2.

    The test points are overlaid on the contour plot, colored blue for
    nominal samples (label 0) and red for anomalies.

    @param kde: fitted KDE model
    @param bandwidth: bandwidth (only used for the plot title)
    @param X_test: test data, shape (n_samples, 2)
    @param y_test: test labels (0 = nominal, anything else = anomaly)
    """
    fig, axis = plt.subplots(figsize=(5, 5))

    lin = np.linspace(-10, 10, 50)
    grid_points = list(it.product(lin, lin))
    ys, xs = np.meshgrid(lin, lin)
    # The score function of sklearn returns log-densities; exponentiate
    # to display actual density values
    scores = np.exp(kde.score_samples(grid_points)).reshape(50, 50)
    colormesh = axis.contourf(xs, ys, scores)
    fig.colorbar(colormesh)
    # Fixed typo in the displayed title: 'Conturs' -> 'Contours'
    axis.set_title('Density Contours (Bandwidth={})'.format(bandwidth))
    axis.set_aspect('equal')
    color = ['blue' if i == 0 else 'red' for i in y_test]
    plt.scatter(X_test[:, 0], X_test[:, 1], c=color)
    plt.show()

We provide you with a selection of popular kernels and a range for the bandwidth. Play with the parameters and inspect the results.

# Module-level holders for the interactively chosen KDE parameters
ker = None
bdw = None
@interact(
    kernel=['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'],
    bandwidth=(.1, 10.)
)
def set_kde_params(kernel: str, bandwidth: float) -> None:
    """Helper function to set widget parameters
    
    @param kernel: kernel
    @param bandwidth: bandwidth
    """
    global ker, bdw

    ker = kernel
    bdw = bandwidth
# interact() invokes the callback once on creation, so ker/bdw are already
# set when these lines run in the same cell
kde = fit_kde(ker, bdw, X_train)
visualize_kde(kde, bdw, X_train, y_train)

Exercise: Choose the KDE Parameters

We've prepared a toy dataset in a similar fashion to the above. Now it's your task to define the search space for suitable kernels and bandwidth parameters.

Use the helper functions to inspect the performance of your density estimator.

# Generate example: nominal 'Sinusoidal' data contaminated with 5% 'Blob' anomalies
dists = create_distributions(dim=2)

distribution_with_anomalies = contamination(
    nominal=dists['Sinusoidal'],
    anomaly=dists['Blob'],
    p=0.05
)

# Train data: sampled from the clean nominal distribution only
sample_train = dists['Sinusoidal'].sample(500)
X_train = sample_train[-1].numpy()

# Test data: sampled from the contaminated mixture; first tuple element
# carries the anomaly labels, last element the features
sample_test = distribution_with_anomalies.sample(500)
X_test = sample_test[-1].numpy()
y_test = sample_test[0].numpy()

scatter = plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
handels, _ = scatter.legend_elements()
plt.legend(handels, ['Nominal', 'Anomaly'])
plt.gca().set_aspect('equal')
plt.show()
# Hyperparameter search space for the KDE. The original cell assigned this
# dict three times in a row (placeholder plus two identical solutions);
# a single definition is kept.
param_space = {
    # All kernels supported by sklearn.neighbors.KernelDensity
    'kernel': ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'],
    # 100 evenly spaced bandwidth candidates in [0.1, 10]
    'bandwidth': np.linspace(0.1, 10, 100),
}
def hyperopt_by_score(X_train: np.array, param_space: dict, cv: int=5):
    """Randomized hyperparameter search for a KernelDensity estimator.

    Candidates are ranked by the estimator's own score, i.e. the mean
    log-probability of the held-out fold under the fitted density.

    @param X_train: data
    @param param_space: parameter space
    @param cv: number of cv folds
    @return: tuple of (best parameters, refitted best estimator)
    """
    # scoring=None delegates to KernelDensity.score, which is the
    # log-probability of the validation set
    search = RandomizedSearchCV(
        estimator=KernelDensity(),
        param_distributions=param_space,
        n_iter=100,
        cv=cv,
        scoring=None,
    )
    search.fit(X_train)
    return search.best_params_, search.best_estimator_

Run the code below to perform hyperparameter optimization.

# Search the space defined above and keep the best KDE
params, kde = hyperopt_by_score(X_train, param_space)

print('Best parameters:')
for key in params:
    print('{}: {}'.format(key, params[key]))

# Anomaly score = negative log-density (higher = more anomalous)
test_scores = -kde.score_samples(X_test)
# Points outside every kernel's support get infinite score; cap them at
# (max finite score + 1) so downstream metrics stay finite
test_scores = np.where(test_scores == np.inf, np.max(test_scores[np.isfinite(test_scores)])+1, test_scores)

curves = evaluate(y_test, test_scores)
Best parameters:
kernel: exponential
bandwidth: 0.1
/Users/maternus/.pyenv/versions/3.10.13/lib/python3.10/site-packages/sklearn/model_selection/_search.py:1051: UserWarning: One or more of the test scores are non-finite: [-583.82004189 -460.99529797 -481.91244569 -500.00511672 -545.60663629
 -561.06553923 -393.29051279 -436.60580108 -508.03454897          -inf
 -452.96132239 -473.5478597  -460.29104963 -522.20259792 -598.90918215
 -467.81178514 -648.98516413 -484.85010317 -534.20785976 -353.24010639
 -497.92497063 -665.66631435 -387.67311697 -615.48543055 -513.96813478
 -487.00829289 -497.7296008  -575.93859803 -518.8042597           -inf
 -477.37083666          -inf -566.16662463 -633.45918335 -457.79561482
 -518.11878857 -631.53362001 -486.75252057 -539.03596524 -611.44215593
 -410.43165152 -390.57063487 -464.54696655 -484.15649314 -462.82886106
 -636.83052276 -558.06951228 -507.02457903 -453.34884767 -562.66855387
 -405.29464878 -413.14388525 -350.10638528 -502.85702971 -525.62212587
 -519.55204716 -382.23093561 -532.16856145 -401.10969297 -485.28425141
 -605.603784   -581.42358882 -392.2899851  -509.08557094 -667.43829292
 -593.63990793 -624.38810819 -497.08999284 -587.4740101  -523.20319385
 -535.06634422 -586.18838572 -491.06019956 -608.14975461 -497.99935071
          -inf -595.89898347 -485.93972563 -480.89997564 -424.8446862
 -504.87132425 -487.69055621 -658.94747821          -inf          -inf
 -641.11022999 -341.0397741  -613.52177566 -600.79929292 -455.08352907
 -531.0443066  -496.60353839 -603.09910009 -518.27158361 -485.81926434
 -446.02585033 -617.26185348          -inf -492.11710758 -509.52974428]
  warnings.warn(
/Users/maternus/.pyenv/versions/3.10.13/lib/python3.10/site-packages/sklearn/model_selection/_search.py:1062: RuntimeWarning: invalid value encountered in subtract
  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights
# Plot the tuned density with the contaminated test points overlaid
visualize_kde(kde, params['bandwidth'], X_test, y_test)

Exercise: Isolate Anomalies in House Prices

You are a company responsible for estimating house prices around Ames, Iowa, specifically around the College area. But there is a problem: houses from a nearby area, 'Veenker', are often included in your dataset. You want to build an anomaly detection algorithm that filters out, one by one, every point that comes from the wrong neighborhood. You have been able to isolate an X_train dataset which, you are sure, contains only houses from the College area. Following the previous example, test your ability to isolate anomalies in new incoming data (X_test) with KDE.

Advanced exercise: What happens if the contamination comes from other areas? You can choose among the following names:

OldTown, Veenker, Edwards, MeadowV, Somerst, NPkVill, BrDale, Gilbert, NridgHt, Sawyer, Blmngtn, Blueste

# Nominal data: 'CollgCr' houses; test set is contaminated with 'Veenker' houses
X_train, X_test, y_test = get_house_prices_data(neighborhood = 'CollgCr', anomaly_neighborhood='Veenker')
# Display the training frame (columns: LotArea, SalePrice, OverallCond)
X_train
LotArea SalePrice OverallCond
0 8461 163990 5
1 10335 204000 5
2 9548 237000 6
3 9245 145000 5
4 15523 133500 6
... ... ... ...
115 9240 287000 5
116 11317 180000 5
117 4426 141000 5
118 9066 230000 5
119 7990 110000 6

120 rows × 3 columns

# Total data: stack train and test sets for a single 3-D visualization.
# pd.concat replaces the private DataFrame._append, which relies on pandas
# internals and is not a supported public API.
train_test_data = pd.concat([X_train, X_test], ignore_index=True)
# Training points are all nominal (label 0); test labels come from the loader
y_total = [0] * len(X_train) + y_test

fig = px.scatter_3d(train_test_data, x='LotArea', y='OverallCond', z='SalePrice', color=y_total)

fig.show()

Solution

# When data are highly inhomogeneous, like in this case, it is often beneficial
# to rescale them before applying any anomaly detection or clustering technique.
scaler = MinMaxScaler()
X_train_rescaled = scaler.fit_transform(X_train)
# Search space: every KernelDensity kernel, 100 bandwidths in [0.1, 10]
param_space = {
    'kernel': ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'], # Add available kernels
    'bandwidth': np.linspace(0.1, 10, 100), # Define Search space for bandwidth parameter
}
params, kde = hyperopt_by_score(X_train_rescaled, param_space)
/Users/maternus/.pyenv/versions/3.10.13/lib/python3.10/site-packages/sklearn/model_selection/_search.py:1051: UserWarning:

One or more of the test scores are non-finite: [ -45.25900425  -42.41544626 -105.24271623  -76.62262594  -58.16553273
  -52.66553234 -183.01584751 -119.35431748  -98.69187821 -168.03134231
 -111.03706006  -62.91122917 -234.261542   -168.22031548 -220.86820955
  -92.32919548 -154.69523369 -227.42928936  -84.284484   -142.67107811
 -117.16427593 -113.6970155  -227.52633839 -161.95663169  -41.52019748
 -136.74875435 -152.97004208 -128.4919998    19.20525133          -inf
  -98.87203905          -inf -159.80490645 -162.17445626  -67.13120683
 -117.6390077  -140.62181147  -50.67220638 -237.44899903 -197.22483009
 -123.50498573 -188.99783275 -101.07642949  -72.83784089 -229.7863771
 -132.07211645 -168.26256671 -230.06793251 -135.31495507 -187.61056982
 -147.31823309          -inf -146.00497938  -33.29831913 -194.93892381
  -96.25876027 -178.48444701 -123.12220664  -83.77069893 -199.88529605
 -170.61800732 -186.74828407 -134.23720459  -35.0511072  -131.81801061
 -224.69026195 -164.15751703          -inf -217.86235331  -79.81216211
 -124.69089389  -13.75418293 -192.81244316 -167.46002124  -72.58312108
 -160.42768007          -inf -229.19908446 -159.69783332 -199.44038951
 -196.43550278 -135.24648056  -71.43898844 -191.77357892 -177.71452367
 -153.08130804  -64.75586096 -151.5744935  -104.68544216 -107.00124511
 -192.57805657    5.68816971 -158.41708204  -30.07922034 -203.15690702
 -165.74543603 -155.17305524 -243.42948442 -142.66462637 -179.45090448]

/Users/maternus/.pyenv/versions/3.10.13/lib/python3.10/site-packages/sklearn/model_selection/_search.py:1062: RuntimeWarning:

invalid value encountered in subtract

print('Best parameters:')
for key in params:
    print('{}: {}'.format(key, params[key]))

# Apply the scaler fitted on the training data (transform only — no refit)
X_test_rescaled = scaler.transform(X_test)
# Anomaly score = negative log-density under the tuned KDE
test_scores = -kde.score_samples(X_test_rescaled)
# Cap infinite scores (points with zero estimated density) at max finite + 1
test_scores = np.where(test_scores == np.inf, np.max(test_scores[np.isfinite(test_scores)])+1, test_scores)
curves = evaluate(y_test, test_scores)
Best parameters:
kernel: epanechnikov
bandwidth: 0.4

Exercise: The Curse of Dimensionality

  • The very slow convergence in high dimensions does not necessarily mean that we will see bad results in high-dimensional anomaly detection with KDE.
  • Especially if the anomalies are very outlying.
  • However, in cases where contours of the nominal distribution are non-convex we can run into problems.

We take a look at a higher-dimensional version of our previous data set.

# 3-D version of the sinusoidal example, contaminated with 1% blob anomalies
dists = create_distributions(dim=3)

distribution_with_anomalies = contamination(
    nominal=dists['Sinusoidal'],
    anomaly=dists['Blob'],
    p=0.01
)

sample = distribution_with_anomalies.sample(500)

# First tuple element: labels; last element: feature matrix
y = sample[0]
X = sample[-1]
fig = px.scatter_3d(x=X[:, 0], y=X[:, 1], z=X[:, 2], color=y)
fig.show()
# Fit KDE on high dimensional examples for dimensions 2..15.
# NOTE(review): rocs and auprs are declared but never appended to in the
# loop below — computing the evaluation metrics appears to be left open.
rocs = []
auprs = []
bandwidths = []

param_space = {
        'kernel': ['gaussian'],
        'bandwidth': np.linspace(0.1, 100, 1000), # Define Search space for bandwidth parameter
    }

kdes = {}
dims = np.arange(2,16)
for d in tqdm(dims):
    # Generate d dimensional distributions
    dists = create_distributions(dim=d)

    # NOTE(review): p=0 yields a test sample with zero anomaly proportion —
    # presumably intentional since only density cross sections are inspected
    # afterwards; confirm if anomaly metrics are wanted here
    distribution_with_anomalies = contamination(
        nominal=dists['Sinusoidal'],
        anomaly=dists['Blob'],
        p=0
    )

    # Train on clean data
    sample_train = dists['Sinusoidal'].sample(500)
    X_train = sample_train[-1].numpy()
    # Test data
    sample_test = distribution_with_anomalies.sample(500)
    X_test = sample_test[-1].numpy()
    y_test = sample_test[0].numpy()

    # Optimize bandwidth and keep the fitted model per dimension
    params, kde = hyperopt_by_score(X_train, param_space)
    kdes[d] = (params, kde)
    
    bandwidths.append(params['bandwidth'])

    # Anomaly score = negative log-density; cap infinite scores at max finite + 1
    test_scores = -kde.score_samples(X_test)
    test_scores = np.where(test_scores == np.inf, np.max(test_scores[np.isfinite(test_scores)])+1, test_scores)

    
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:13<00:00,  1.01it/s]
# Plot the 2-D cross section of each fitted d-dimensional pdf
fig, axes = plt.subplots(nrows=2, ncols=7, figsize=(15, 5))
for d, axis in tqdm(list(zip(kdes, axes.flatten()))):
    
    params, kde = kdes[d]

    lin = np.linspace(-10, 10, 50)
    # Fix the first d-2 coordinates at 0 and sweep the last two over the grid
    grid_points = list(it.product(*([[0]]*(d-2)), lin, lin))
    ys, xs = np.meshgrid(lin, lin)
    # The score function of sklearn returns log-densities
    scores = np.exp(kde.score_samples(grid_points)).reshape(50, 50)
    colormesh = axis.contourf(xs, ys, scores)
    axis.set_title("Dim = {}".format(d))
    axis.set_aspect('equal')
    

# Plot evaluation
print('Crossection of the KDE at (0,...,0, x, y)')
plt.show()
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 23.71it/s]
Crossection of the KDE at (0,...,0, x, y)

Test it out - Huber and Hampel Loss

Huber Loss

Switch from quadratic to linear loss at prescribed threshold.

import numpy as np


def huber(error: float, threshold: float):
    """Huber loss: quadratic for small errors, linear beyond the threshold.

    Works elementwise on numpy arrays as well as on scalars.

    @param error: base error
    @param threshold: threshold for the quadratic-to-linear transition
    """
    within = np.abs(error) <= threshold
    quadratic = (error ** 2) / 2
    linear = threshold * (np.abs(error) - threshold / 2)
    # Select the quadratic branch inside the threshold, linear outside
    return within * quadratic + (1 - within) * linear

# Visualize the Huber loss: quadratic near zero, linear beyond |x| = 1
x = np.linspace(-5, 5)
y = huber(x, 1)

plt.plot(x, y)
plt.gca().set_title("Huber Loss")
plt.show()

Hampel loss

A more complex loss function that depends on three parameters 0 < a < b < r.

import numpy as np

def single_point_hampel(error: float, a: float, b: float, r: float):
    """Hampel loss for a single error value.

    Quadratic up to a, linear between a and b, smoothly redescending
    between b and r, and constant beyond r. Requires 0 < a < b < r.

    @param error: base error
    @param a: 1st threshold parameter
    @param b: 2nd threshold parameter
    @param r: 3rd threshold parameter
    """
    magnitude = abs(error)
    if magnitude <= a:
        # Quadratic core around zero
        return magnitude ** 2 / 2
    if magnitude <= b:
        # Linear section, continuous with the quadratic part at |error| = a
        return a ** 2 / 2 + a * (magnitude - a)
    if magnitude <= r:
        # Redescending section, tapering off toward the constant tail
        return a * (2 * b - a + (magnitude - b) * (1 + (r - magnitude) / (r - b))) / 2
    # Constant tail beyond r
    return a * (b - a + r) / 2

# Vectorize the scalar loss so it accepts numpy arrays elementwise
# (np.vectorize is a convenience wrapper, not a performance optimization)
hampel = np.vectorize(single_point_hampel)

x = np.linspace(-10.1, 10.1)
y = hampel(x, a=1.5, b=3.5, r=8)

plt.plot(x, y)
plt.gca().set_title("Hampel Loss")
plt.show()

Exercise: Distribution Recovery under Contamination

We compare the performance of different approaches to recover the nominal distribution under contamination. Here, we use code by Humbert et al. to replicate the results in the referenced paper on median-of-mean KDE. More details on rKDE can instead be found in this paper by Kim and Scott.

# =======================================================
#   Parameters
# =======================================================
# =======================================================
#   Parameters
# =======================================================
algos = [
    'kde',
    'mom-kde', # Median-of-Means
    'rkde-huber', # robust KDE with huber loss
    'rkde-hampel', # robust KDE with hampel loss
]

dataset = 'house-prices'
dataset_options = {'neighborhood': 'CollgCr', 'anomaly_neighborhood': 'Edwards'}

# Contamination levels to sweep, from 1% up to 50% outliers
outlierprop_range = [0.01, 0.02, 0.03, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4, 0.5]
kernel = 'gaussian'
# Runs every algorithm at every contamination level and returns AUC scores
auc_scores = perform_rkde_experiment(
    algos,
    dataset,
    dataset_options,
    outlierprop_range,
    kernel,
)
Dataset:  house-prices
Outlier prop: 0.01 (1 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 4 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 4 iterations
Stop at 100 iterations

Outlier prop: 0.02 (2 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 4 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 4 iterations
Stop at 10 iterations

Outlier prop: 0.03 (3 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 3 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 3 iterations
Stop at 100 iterations

Outlier prop: 0.05 (4 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 5 iterations
Stop at 3 iterations
Algo:  rkde-hampel
Stop at 5 iterations
Stop at 13 iterations

Outlier prop: 0.07 (5 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 4 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 4 iterations
Stop at 100 iterations

Outlier prop: 0.1 (6 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 4 iterations
Stop at 3 iterations
Algo:  rkde-hampel
Stop at 4 iterations
Stop at 100 iterations

Outlier prop: 0.2 (7 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 5 iterations
Stop at 3 iterations
Algo:  rkde-hampel
Stop at 5 iterations
Stop at 100 iterations

Outlier prop: 0.3 (8 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 4 iterations
Stop at 3 iterations
Algo:  rkde-hampel
Stop at 4 iterations
Stop at 15 iterations

Outlier prop: 0.4 (9 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 4 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 4 iterations
Stop at 100 iterations

Outlier prop: 0.5 (10 / 10)
downsample inliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 3 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 3 iterations
Stop at 100 iterations
# Plot mean anomaly-AUC per contamination level, one curve per algorithm
fig, ax = plt.subplots(figsize=(7, 5))
for algo, algo_data in auc_scores.groupby('algo'):
    algo_data = algo_data.drop("algo", axis=1)
    # Compute the per-level means once instead of grouping twice
    mean_auc = algo_data.groupby('outlier_prop').mean()['auc_anomaly']
    ax.plot(mean_auc.index, mean_auc.values, 'o-', label=algo)
plt.legend()
plt.xlabel('outlier_prop')
plt.ylabel('auc_score')
plt.title('Comparison of rKDE against contamination')
Text(0.5, 1.0, 'Comparison of rKDE against contamination')

Try using different neighborhoods for contamination. Which robust KDE algorithm performs better overall? Choose among the following options:

OldTown, Veenker, Edwards, MeadowV, Somerst, NPkVill, BrDale, Gilbert, NridgHt, Sawyer, Blmngtn, Blueste

You can also change the kernel type: gaussian, tophat, epanechnikov, exponential, linear or cosine.

Snow